#coding=utf-8
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re;
from HttpClient import httpCall;
from MysqlOperate import queryCityCodeByName;
import json;

'''
爬取携程官网青旅信息的爬虫
'''

def fetch_hotel_info(hotel_id, city_name):
    """Scrape the details of one Ctrip hostel from its web and H5 pages.

    Args:
        hotel_id: Ctrip hotel id; it is interpolated into both page URLs
            and stored under 'id'.
        city_name: City name stored under 'cityName'; convertCity uses it
            to fill in 'province' and 'city'.

    Returns:
        A dict with the keys 'name', 'address', 'latitude', 'longitude',
        'desc', 'phone', 'decorate_time', 'rooms_num', 'rooms_desc',
        'pics', 'id', 'cityName', 'province' and 'city', or None when any
        step fails (the error is printed, not raised).
    """
    try:
        # Basic information comes from the desktop web page.
        web_url = "http://hotels.ctrip.com/hotel/" + str(hotel_id) + ".html"
        soup = _fetch_soup(web_url)

        hotel_info = {}
        hotel_info['name'] = soup.find(itemprop="name").string
        hotel_info['address'] = _parse_address(soup)
        hotel_info['latitude'] = soup.find(itemprop="latitude")['content']
        hotel_info['longitude'] = soup.find(itemprop="longitude")['content']
        hotel_info.update(_parse_description(soup))

        # Room information is not available on the web page, so it is taken
        # from the H5 (mobile) page instead.
        h5_url = "http://m.ctrip.com/html5/Hotel/HotelDetail/" + str(hotel_id) + ".html"
        hotel_info['rooms_desc'] = _parse_rooms(_fetch_soup(h5_url))

        hotel_info['pics'] = _parse_pics(soup)

        hotel_info['id'] = hotel_id
        hotel_info['cityName'] = city_name
        convertCity(hotel_info)

        return hotel_info
    except Exception as e:
        # Top-level boundary of the scraper: report the failure and signal
        # it to the caller with None instead of propagating.
        print('html parse error occours, the hotel_id is:' + str(hotel_id) + '   ' + repr(e))
        return None


def _fetch_soup(url):
    """Download url and parse it with html5lib, closing the HTTP response."""
    # BeautifulSoup reads the response inside its constructor, so the
    # connection can be released as soon as the soup has been built.
    with urlopen(url) as response:
        return BeautifulSoup(response, "html5lib", from_encoding='utf-8')


def _optional_string(parent, tag_id):
    """Return the .string of the tag with the given id, or None if it is absent."""
    tag = parent.find(id=tag_id)
    return None if tag is None else tag.string


def _first_number(text):
    """Return the first run of digits in text, or None when there is none."""
    match = re.search(r"[0-9]+", text)
    return match.group(0) if match else None


def _parse_address(soup):
    """Assemble the full address from the city, zone, street and cross-road tags."""
    header = soup.find(itemprop="address")
    # City and street are required; a missing tag raises and fails the scrape.
    city = header.find(id="ctl00_MainContentPlaceHolder_commonHead_lnkCity").string
    street = header.find(id="ctl00_MainContentPlaceHolder_commonHead_lbAddress").string
    # Zone and cross-road are optional and may be missing from the page.
    zone = _optional_string(header, "ctl00_MainContentPlaceHolder_commonHead_lnkLocation")
    cross = _optional_string(header, "ctl00_MainContentPlaceHolder_commonHead_lnkRoadCross")

    parts = []
    # Only prepend city (and zone) when the street text does not already
    # include them, to avoid duplicated fragments in the address.
    if city not in street:
        parts.append(city)
        if zone is not None and zone not in street:
            parts.append(zone)
    parts.append(street)
    if cross is not None:
        parts.append(cross)
    return ''.join(parts)


def _parse_description(soup):
    """Extract description, phone, decoration time and room count from #htlDes."""
    block = soup.find(id="htlDes")
    desc = block.find(id="ctl00_MainContentPlaceHolder_hotelDetailInfo_lbDesc").get_text().strip()
    # First whitespace-separated token of 'data-real' with its first two
    # characters dropped (presumably a label prefix -- TODO confirm).
    phone = block.find(id="J_realContact")['data-real'].split()[0][2:]

    decorate_time = ''
    rooms_num = 0
    # The first text fragment of the block carries the opening/renovation
    # year and the number of rooms as whitespace-separated tokens.
    summary = block.get_text('|', strip=True).split('|')[0]
    for token in summary.split():
        number = _first_number(token)
        if number is None:
            # A keyword without digits must not abort the whole scrape.
            continue
        # A renovation token that comes later overrides the opening year.
        if '开业' in token or '装修' in token:
            decorate_time = number
        if '间' in token:
            rooms_num = number

    return {
        'desc': desc,
        'phone': phone,
        'decorate_time': decorate_time,
        'rooms_num': rooms_num,
    }


def _parse_rooms(h5_soup):
    """Pair room names with prices from the H5 page's #js_list element."""
    room_list = h5_soup.find(id="js_list")
    names = [heading.string for heading in room_list.find_all('h3')]
    prices = [
        re.findall("[0-9]+", price.get_text())[0]
        for price in room_list.select('.js_baseroom_item .price')
    ]
    # zip stops at the shorter list, so a room without a price no longer
    # raises IndexError and discards the whole hotel.
    return [
        {'room_name': name, 'room_price': price}
        for name, price in zip(names, prices)
    ]


def _parse_pics(soup):
    """Collect photo and thumbnail URLs from #topPicList.

    Photo collection is a deliberate compromise: the pictures are
    watermarked, so only the ones in the top picture list are gathered to
    fill in data rather than crawling every picture.
    """
    pics = []
    gallery = soup.find(id="topPicList")
    for frame in gallery.find_all(class_=re.compile("^pic[1-9]+")):
        # recursive=False restricts the search to direct child tags; plain
        # .children also yields text nodes, which have no has_attr().
        for child in frame.find_all(True, recursive=False):
            if child.has_attr('_src') and child.has_attr('style'):
                style = child['style']
                # The thumbnail URL is the text between '(' and ')' of the
                # inline style value.
                start = style.find('(')
                end = style.find(')')
                pics.append({
                    'photo_url': child['_src'],
                    'small_photo_url': style[start + 1:end],
                })
    return pics

'''
因为携程的城市编号和TravelGo的城市编号不一致,因此需要转换城市信息
'''
def convertCity(hotel_info):
    """Fill in 'province' and 'city' on hotel_info from its 'cityName'.

    Ctrip's city codes differ from TravelGo's, so the codes are looked up
    by city name through queryCityCodeByName and copied onto the dict.

    Args:
        hotel_info: Hotel dict containing at least 'cityName'; it is
            modified in place.

    Returns:
        The same hotel_info dict, with 'province' and 'city' set.
    """
    codes = queryCityCodeByName(hotel_info['cityName'])
    for field in ('province', 'city'):
        hotel_info[field] = codes[field]
    return hotel_info

'''
分页爬取某城市下面的青旅数据
'''
def fetchHotelUrl(param):
    """Fetch, page by page, the hostels of one city from Ctrip's hotel list.

    Args:
        param: Form parameters posted to the list endpoint. It must contain
            'page' (the page number to start from) and 'cityName'.
            'page' is advanced in place as further pages are requested.

    Returns:
        A list of the hotel dicts from 'hotelPositionJSON' whose name
        contains '旅舍' or '旅社', each with 'city' set to
        param['cityName']; or None when any request or parse step fails
        (the error is printed, not raised).
    """
    try:
        # Request headers.
        headers = {
            'cache-control': 'no-cache',
            'Content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'hotels.ctrip.com',
            'Origin': 'http://hotels.ctrip.com',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36',
        }
        url = "http://hotels.ctrip.com/Domestic/Tool/AjaxHotelList.aspx"
        # Number of hotels per page on the Ctrip web version.
        page_size = 25

        hotelList = []
        while True:
            result = httpCall(url, "POST", param, headers)
            # The second element of the result is parsed as the JSON payload.
            resultMap = json.loads(result[1])
            # Total number of hotels in this city.
            total_amount = resultMap['hotelAmount']
            # Keep only the hostels among the hotels returned for this page.
            for hotel in resultMap['hotelPositionJSON']:
                name = hotel['name']
                # Substring test: also matches names that START with the
                # keyword, which the previous find(...) > 0 check skipped.
                if '旅舍' in name or '旅社' in name:
                    hotel['city'] = param['cityName']
                    hotelList.append(hotel)
            # Stop once the current page covers the reported total.
            if page_size * param['page'] >= total_amount:
                return hotelList
            param['page'] = param['page'] + 1
    except Exception as e:
        # Top-level boundary: report once and signal failure with None.
        print('fetch hotel list error, cityName = ' + param['cityName'] + ' ' + repr(e))
        return None


